1.A. Remember to use getwd() and setwd() to set the working directory in your R Markdown file. For example: mydir <- getwd() followed by setwd(mydir).
library(rmarkdown)
library(psych)
library(scatterplot3d)
library(caret)## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
## Loading required package: lattice
# Capture the working directory and set it explicitly. Re-applying the value
# returned by getwd() leaves the directory unchanged; both calls are kept
# because the task asks for them.
mydir <- getwd()
setwd(mydir)
# Load the balanced data set. stringsAsFactors = FALSE keeps the text columns
# as character vectors instead of converting them to factors on import.
balanced <- read.csv("CD_additional_balanced.csv", stringsAsFactors = FALSE)
# Examine the overall 'structure' of the input data
str(balanced)
## 'data.frame': 9280 obs. of 21 variables:
## $ age : int 41 49 49 41 45 42 39 28 44 42 ...
## $ job : chr "blue-collar" "entrepreneur" "technician" "technician" ...
## $ marital : chr "divorced" "married" "married" "married" ...
## $ education : chr "basic.4y" "university.degree" "basic.9y" "professional.course" ...
## $ default : chr "unknown" "unknown" "no" "unknown" ...
## $ housing : chr "yes" "yes" "no" "yes" ...
## $ loan : chr "no" "no" "no" "no" ...
## $ contact : chr "telephone" "telephone" "telephone" "telephone" ...
## $ month : chr "may" "may" "may" "may" ...
## $ day_of_week : chr "mon" "mon" "mon" "mon" ...
## $ duration : int 1575 1042 1467 579 461 673 935 1201 1030 1623 ...
## $ campaign : int 1 1 1 1 1 2 3 1 1 1 ...
## $ pdays : int 999 999 999 999 999 999 999 999 999 999 ...
## $ previous : int 0 0 0 0 0 0 0 0 0 0 ...
## $ poutcome : chr "nonexistent" "nonexistent" "nonexistent" "nonexistent" ...
## $ emp.var.rate : num 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 ...
## $ cons.price.idx: num 94 94 94 94 94 ...
## $ cons.conf.idx : num -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 ...
## $ euribor3m : num 4.86 4.86 4.86 4.86 4.86 ...
## $ nr.employed : num 5191 5191 5191 5191 5191 ...
## $ y : chr "yes" "yes" "yes" "yes" ...
1.B. Transform all of the character variables that include categorical values to factor variables. After this transformation, show the overall ‘structure’ and the ‘summary’ of the input data.
# Check column by column whether anything is already a factor. Calling
# is.factor() on as.character(<data frame>) tests a single character vector
# and is FALSE whatever the column types are, so it cannot answer this.
any(vapply(balanced, is.factor, logical(1)))
## [1] FALSE
# Changing character variables to factor: find every character column and
# convert them all in one step, rather than one assignment per column.
char_cols <- names(balanced)[vapply(balanced, is.character, logical(1))]
balanced[char_cols] <- lapply(balanced[char_cols], factor)
# Show the overall 'structure' and the 'summary' of the input data
str(balanced)
## 'data.frame': 9280 obs. of 21 variables:
## $ age : int 41 49 49 41 45 42 39 28 44 42 ...
## $ job : Factor w/ 12 levels "admin.","blue-collar",..: 2 3 10 10 2 2 4 12 8 10 ...
## $ marital : Factor w/ 4 levels "divorced","married",..: 1 2 2 2 2 2 2 3 2 2 ...
## $ education : Factor w/ 8 levels "basic.4y","basic.6y",..: 1 7 3 6 3 3 3 8 4 6 ...
## $ default : Factor w/ 2 levels "no","unknown": 2 2 1 2 2 1 1 2 1 1 ...
## $ housing : Factor w/ 3 levels "no","unknown",..: 3 3 1 3 3 3 3 3 3 1 ...
## $ loan : Factor w/ 3 levels "no","unknown",..: 1 1 1 1 1 3 1 3 1 1 ...
## $ contact : Factor w/ 2 levels "cellular","telephone": 2 2 2 2 2 2 2 2 2 2 ...
## $ month : Factor w/ 10 levels "apr","aug","dec",..: 7 7 7 7 7 7 7 7 7 7 ...
## $ day_of_week : Factor w/ 5 levels "fri","mon","thu",..: 2 2 2 2 2 2 2 4 4 4 ...
## $ duration : int 1575 1042 1467 579 461 673 935 1201 1030 1623 ...
## $ campaign : int 1 1 1 1 1 2 3 1 1 1 ...
## $ pdays : int 999 999 999 999 999 999 999 999 999 999 ...
## $ previous : int 0 0 0 0 0 0 0 0 0 0 ...
## $ poutcome : Factor w/ 3 levels "failure","nonexistent",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ emp.var.rate : num 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 ...
## $ cons.price.idx: num 94 94 94 94 94 ...
## $ cons.conf.idx : num -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 ...
## $ euribor3m : num 4.86 4.86 4.86 4.86 4.86 ...
## $ nr.employed : num 5191 5191 5191 5191 5191 ...
## $ y : Factor w/ 2 levels "no","yes": 2 2 2 2 2 2 2 2 2 2 ...
# Per-column summary of the data frame; the recorded output below lists
# Min/quartiles/Mean/Max for numeric columns and level counts for factors.
summary(balanced)
## age job marital education
## Min. :17.0 admin. :2517 divorced:1021 university.degree :3007
## 1st Qu.:31.0 blue-collar:1769 married :5338 high.school :2102
## Median :38.0 technician :1459 single :2900 professional.course:1190
## Mean :40.4 services : 773 unknown : 21 basic.9y :1177
## 3rd Qu.:48.0 management : 651 basic.4y : 895
## Max. :98.0 retired : 595 basic.6y : 458
## (Other) :1516 (Other) : 451
## default housing loan contact month
## no :7824 no :4104 no :7688 cellular :6672 may :2533
## unknown:1456 unknown: 225 unknown: 225 telephone:2608 jul :1477
## yes :4951 yes :1367 aug :1353
## jun :1169
## nov : 886
## apr : 785
## (Other):1077
## day_of_week duration campaign pdays previous
## fri:1763 Min. : 1.0 Min. : 1.000 Min. : 0.0 Min. :0.0000
## mon:1846 1st Qu.: 145.0 1st Qu.: 1.000 1st Qu.:999.0 1st Qu.:0.0000
## thu:2000 Median : 265.0 Median : 2.000 Median :999.0 Median :0.0000
## tue:1810 Mean : 387.4 Mean : 2.333 Mean :887.3 Mean :0.3153
## wed:1861 3rd Qu.: 528.0 3rd Qu.: 3.000 3rd Qu.:999.0 3rd Qu.:0.0000
## Max. :4199.0 Max. :39.000 Max. :999.0 Max. :6.0000
##
## poutcome emp.var.rate cons.price.idx cons.conf.idx
## failure :1074 Min. :-3.4000 Min. :92.20 Min. :-50.80
## nonexistent:7244 1st Qu.:-1.8000 1st Qu.:92.89 1st Qu.:-42.70
## success : 962 Median :-0.1000 Median :93.44 Median :-41.80
## Mean :-0.4963 Mean :93.48 Mean :-40.22
## 3rd Qu.: 1.4000 3rd Qu.:93.99 3rd Qu.:-36.40
## Max. : 1.4000 Max. :94.77 Max. :-26.90
##
## euribor3m nr.employed y
## Min. :0.634 Min. :4964 no :4640
## 1st Qu.:1.244 1st Qu.:5076 yes:4640
## Median :4.021 Median :5191
## Mean :2.960 Mean :5135
## 3rd Qu.:4.959 3rd Qu.:5228
## Max. :5.045 Max. :5228
##
2.A. Create a histogram and include a title of the histogram.
# One histogram per numeric variable. The names of `hist_labels` are the
# column names (also used as x-axis labels); the values are the capitalised
# forms that appear in the plot titles.
hist_labels <- c(
  age = "Age",
  duration = "Duration",
  campaign = "Campaign",
  pdays = "Pdays"
)
for (column_name in names(hist_labels)) {
  hist(
    balanced[[column_name]],
    main = paste(
      "Histogram of", hist_labels[[column_name]],
      "in the CD Additional Balanced data set"
    ),
    xlab = column_name
  )
}
2.B. Create a boxplot and include a title in the plot.
# One boxplot per numeric variable. The names of `box_labels` are the column
# names (also used as y-axis labels); the values are the capitalised forms
# that appear in the plot titles.
box_labels <- c(
  age = "Age",
  duration = "Duration",
  campaign = "Campaign",
  pdays = "Pdays"
)
for (column_name in names(box_labels)) {
  boxplot(
    balanced[[column_name]],
    main = paste(
      "Boxplot of", box_labels[[column_name]],
      "in the CD Additional Balanced data set"
    ),
    ylab = column_name
  )
}
2.C. Show deciles of the variable.
# Decile cut points shared by every call below: 0%, 10%, ..., 100%.
decile_probs <- seq(0, 1, by = 0.1)

# Deciles of age
quantile(balanced$age, probs = decile_probs)
## 0% 10% 20% 30% 40% 50% 60% 70% 80% 90% 100%
## 17 27 30 33 35 38 41 46 51 57 98

# Deciles of duration
quantile(balanced$duration, probs = decile_probs)
## 0% 10% 20% 30% 40% 50% 60% 70% 80% 90% 100%
## 1 80 124 167 211 265 340 452 615 860 4199

# Deciles of campaign
quantile(balanced$campaign, probs = decile_probs)
## 0% 10% 20% 30% 40% 50% 60% 70% 80% 90% 100%
## 1 1 1 1 1 2 2 2 3 4 39

# Deciles of pdays
quantile(balanced$pdays, probs = decile_probs)
## 0% 10% 20% 30% 40% 50% 60% 70% 80% 90% 100%
## 0 11 999 999 999 999 999 999 999 999 999
3.A. For each of the selected factor variables, and for each of the variable’s levels (e.g., “success”, “failure”, “nonexistent” of poutcome), show the count value and percentage value of instances belonging to that level.
Note: Select variable y and three other factor variables (e.g., job, education and poutcome) for this task. Do not include additional variables.
# For each selected factor, count the rows per level and then express those
# counts as percentages of all rows. Each percentage table is derived from the
# count table already computed for the same variable.

# y
y.table <- table(balanced$y)
y.table
##
## no yes
## 4640 4640
y.perc <- 100 * prop.table(y.table)
y.perc
##
## no yes
## 50 50

# job
job.table <- table(balanced$job)
job.table
##
## admin. blue-collar entrepreneur housemaid management
## 2517 1769 308 216 651
## retired self-employed services student technician
## 595 306 773 358 1459
## unemployed unknown
## 248 80
job.perc <- 100 * prop.table(job.table)
job.perc
##
## admin. blue-collar entrepreneur housemaid management
## 27.122845 19.062500 3.318966 2.327586 7.015086
## retired self-employed services student technician
## 6.411638 3.297414 8.329741 3.857759 15.721983
## unemployed unknown
## 2.672414 0.862069

# education
education.table <- table(balanced$education)
education.table
##
## basic.4y basic.6y basic.9y high.school
## 895 458 1177 2102
## illiterate professional.course university.degree unknown
## 6 1190 3007 445
education.perc <- 100 * prop.table(education.table)
education.perc
##
## basic.4y basic.6y basic.9y high.school
## 9.64439655 4.93534483 12.68318966 22.65086207
## illiterate professional.course university.degree unknown
## 0.06465517 12.82327586 32.40301724 4.79525862

# poutcome
poutcome.table <- table(balanced$poutcome)
poutcome.table
##
## failure nonexistent success
## 1074 7244 962
poutcome.perc <- 100 * prop.table(poutcome.table)
poutcome.perc
##
## failure nonexistent success
## 11.57328 78.06034 10.36638
3.B. For each of the selected variables, show a bar plot of the number of instances (i.e. count) with a level name for each possible value. Show a descriptive title in each plot.
# Bar plots of the level counts for y, job, education and poutcome.
# Level names are drawn perpendicular to the axis (las = 2) in a slightly
# smaller font: with the default horizontal orientation base graphics omits
# labels that would overlap, so not every job/education level would be named.
# The bottom margin is enlarged to fit the rotated names and restored at the
# end so later plots are unaffected.
old_par <- par(mar = c(10, 4, 4, 2) + 0.1)
count_tables <- list(
  y = y.table,
  job = job.table,
  education = education.table,
  poutcome = poutcome.table
)
for (factor_name in names(count_tables)) {
  barplot(
    count_tables[[factor_name]],
    main = paste(
      "Bar Plot of", factor_name,
      "in the CD Additional Balanced data set"
    ),
    las = 2,
    cex.names = 0.8
  )
  # Draw the axis title below the rotated level names instead of through them.
  title(xlab = factor_name, line = 8.5)
}
par(old_par)
4.A Use cor and pairs.panels to display correlations for these seven numeric variables – age, duration, campaign, pdays, euribor3m, emp.var.rate, and nr.employed.
# Gather the seven numeric variables named in the task into one data frame.
numeric_columns <- c(
  "age", "duration", "campaign", "pdays",
  "euribor3m", "emp.var.rate", "nr.employed"
)
cor_display <- balanced[numeric_columns]
# Correlation of variables (correlation matrix of the selected columns)
cor(cor_display)
## age duration campaign pdays euribor3m
## age 1.000000000 -0.02072651 0.003690016 -0.05351616 -0.04462745
## duration -0.020726510 1.00000000 -0.025872465 0.02893622 0.05733951
## campaign 0.003690016 -0.02587247 1.000000000 0.08930062 0.17512283
## pdays -0.053516156 0.02893622 0.089300624 1.00000000 0.38773934
## euribor3m -0.044627449 0.05733951 0.175122827 0.38773934 1.00000000
## emp.var.rate -0.049052629 0.07144035 0.185736186 0.33488799 0.95840218
## nr.employed -0.074686516 0.05823209 0.176972215 0.47499217 0.94054583
## emp.var.rate nr.employed
## age -0.04905263 -0.07468652
## duration 0.07144035 0.05823209
## campaign 0.18573619 0.17697221
## pdays 0.33488799 0.47499217
## euribor3m 0.95840218 0.94054583
## emp.var.rate 1.00000000 0.86752989
## nr.employed 0.86752989 1.00000000
# Display of correlated variables
pairs.panels(cor_display)
4.B For each of these numeric variables - duration, emp.var.rate, cons.price.idx, and cons.conf.idx - show a boxplot of the variable by y and use aggregate with summary to show its statistics by y.
The output of aggregate in this task will NOT be visible until you knit. You can test your code by copying and pasting it into the console.
# Boxplot of variables: distribution of each numeric variable split by y.
# One call per line so that every statement parses on its own, and each plot
# carries a descriptive title like the other plots in this report.
boxplot(duration ~ y, data = balanced,
  main = "Boxplot of duration by y in the CD Additional Balanced data set")
boxplot(emp.var.rate ~ y, data = balanced,
  main = "Boxplot of emp.var.rate by y in the CD Additional Balanced data set")
boxplot(cons.price.idx ~ y, data = balanced,
  main = "Boxplot of cons.price.idx by y in the CD Additional Balanced data set")
boxplot(cons.conf.idx ~ y, data = balanced,
  main = "Boxplot of cons.conf.idx by y in the CD Additional Balanced data set")

# Aggregate of variables: summary() of each variable within each level of y.
# FUN is named explicitly rather than matched by position.
aggregate(duration ~ y, data = balanced, FUN = summary)
## y duration.Min. duration.1st Qu. duration.Median duration.Mean
## 1 no 1.0000 94.0000 166.0000 221.5323
## 2 yes 37.0000 253.0000 449.0000 553.1912
## duration.3rd Qu. duration.Max.
## 1 279.2500 1994.0000
## 2 741.2500 4199.0000
aggregate(emp.var.rate ~ y, data = balanced, FUN = summary)
## y emp.var.rate.Min. emp.var.rate.1st Qu. emp.var.rate.Median
## 1 no -3.4000000 -1.8000000 1.1000000
## 2 yes -3.4000000 -1.8000000 -1.8000000
## emp.var.rate.Mean emp.var.rate.3rd Qu. emp.var.rate.Max.
## 1 0.2409052 1.4000000 1.4000000
## 2 -1.2334483 -0.1000000 1.4000000
aggregate(cons.price.idx ~ y, data = balanced, FUN = summary)
## y cons.price.idx.Min. cons.price.idx.1st Qu. cons.price.idx.Median
## 1 no 92.20100 93.07500 93.91800
## 2 yes 92.20100 92.89300 93.20000
## cons.price.idx.Mean cons.price.idx.3rd Qu. cons.price.idx.Max.
## 1 93.60397 93.99400 94.76700
## 2 93.35439 93.91800 94.76700
aggregate(cons.conf.idx ~ y, data = balanced, FUN = summary)
## y cons.conf.idx.Min. cons.conf.idx.1st Qu. cons.conf.idx.Median
## 1 no -50.80000 -42.70000 -41.80000
## 2 yes -50.80000 -46.20000 -40.40000
## cons.conf.idx.Mean cons.conf.idx.3rd Qu. cons.conf.idx.Max.
## 1 -40.64647 -36.40000 -26.90000
## 2 -39.78978 -36.10000 -26.90000
4.C Draw a 3d scatter plot to show y values in shapes (e.g. circle for “no”, triangle for “yes”) for each of the following combinations of numeric variables (along the three axes). Include a main title for the plot and legend for the shapes of y in the plot. (i) age, campaign and duration (ii) nr.employed, euribor3m and duration
# 3D scatter plots in which the plotting symbol encodes y. as.numeric() on the
# factor yields its level codes (1 for "no", 2 for "yes"), which are passed as
# pch values; the legend pairs the levels with pch 1 and 2 in the same order.
y_shapes <- as.numeric(balanced$y)
y_levels <- levels(balanced$y)

# (i) age, campaign and duration
scatterplot3d(
  balanced$age, balanced$campaign, balanced$duration,
  pch = y_shapes,
  main = "3D scatter plot of age, campaign and duration in the CD Additional Balanced data"
)
legend("topright", legend = y_levels, pch = 1:2, cex = 0.8)

# (ii) nr.employed, euribor3m and duration
scatterplot3d(
  balanced$nr.employed, balanced$euribor3m, balanced$duration,
  pch = y_shapes,
  main = "3D scatter plot of nr.employed, euribor3m and duration in the CD Additional Balanced data"
)
legend("topright", legend = y_levels, pch = 1:2, cex = 0.8)